/* * regain/Thumbnailer - A file search engine providing plenty of formats (Plugin) * Copyright (C) 2011 Come_IN Computerclubs (University of Siegen) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Contact: Come_IN-Team <come_in-team@listserv.uni-siegen.de> */ package de.uni_siegen.wineme.come_in.thumbnailer.plugin; import java.io.File; import java.io.IOException; import java.util.Collection; import java.util.HashSet; import java.util.Map; import java.util.zip.DataFormatException; import org.apache.log4j.Logger; import org.apache.lucene.document.CompressionTools; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import de.uni_siegen.wineme.come_in.thumbnailer.FileDoesNotExistException; import de.uni_siegen.wineme.come_in.thumbnailer.ThumbnailerConstants; import de.uni_siegen.wineme.come_in.thumbnailer.ThumbnailerException; import de.uni_siegen.wineme.come_in.thumbnailer.ThumbnailerManager; import de.uni_siegen.wineme.come_in.thumbnailer.thumbnailers.JODConverterThumbnailer; import de.uni_siegen.wineme.come_in.thumbnailer.thumbnailers.JODExcelConverterThumbnailer; import de.uni_siegen.wineme.come_in.thumbnailer.thumbnailers.JODHtmlConverterThumbnailer; import de.uni_siegen.wineme.come_in.thumbnailer.thumbnailers.JODPowerpointConverterThumbnailer; import de.uni_siegen.wineme.come_in.thumbnailer.thumbnailers.JODWordConverterThumbnailer; import de.uni_siegen.wineme.come_in.thumbnailer.thumbnailers.NativeImageThumbnailer; import de.uni_siegen.wineme.come_in.thumbnailer.thumbnailers.OpenOfficeThumbnailer; import de.uni_siegen.wineme.come_in.thumbnailer.thumbnailers.PDFBoxThumbnailer; import de.uni_siegen.wineme.come_in.thumbnailer.thumbnailers.ScratchThumbnailer; import de.uni_siegen.wineme.come_in.thumbnailer.util.IOUtil; import de.uni_siegen.wineme.come_in.thumbnailer.util.mime.MimeTypeDetector; import net.sf.regain.RegainException; import net.sf.regain.crawler.Crawler; import net.sf.regain.crawler.config.PreparatorConfig; import net.sf.regain.crawler.document.RawDocument; import net.sf.regain.crawler.document.WriteablePreparator; import net.sf.regain.crawler.plugin.AbstractCrawlerPlugin; /** * Integration of Thumbnailer into Regain * * In order to save ressources, the Thumbnail Library remains loaded only until the end of the crawling process. * * @author Benjamin */ public class ThumbnailerPlugin extends AbstractCrawlerPlugin implements ThumbnailerLuceneConstants, ThumbnailerConstants { /** * @var Logger instance */ private static Logger mLog = Logger.getLogger(ThumbnailerPlugin.class); /** * @var Thumbnailer Manager (can be null if deactivated) */ private ThumbnailerManager thumbnailer; /** * Keep track of all files that where generated during a crawling process, * in order to delete those which didn't reach the regain index. */ private Collection<String> generatedThumbnailsNotIndexed; /** * @var If thumbnails should be generated. * This can be deactivated due to initialization errors, or via config. */ private boolean thumbnailGenerationDeactivated = false; /** * Parameters that were found in the XML Configuration */ private int paramThumbnailWidth; private int paramThumbnailHeight; private File paramThumbnailFolder; private String paramOpenOfficeHome; private String paramOpenOfficeProfile; private int paramOpenOfficePort; private MimeTypeDetector mimeTypeDetector; /** * Initializes the plugin. * * @param config The configuration for this plugin. * @throws RegainException When the configuration has an error. */ @Override public void init(PreparatorConfig config) throws RegainException { Map<String, String> thumbnailConfig = config.getSectionWithName("thumbnailing"); String paramThumbnailFolderStr = null; if (thumbnailConfig != null) paramThumbnailFolderStr = thumbnailConfig.get("thumbnailFolder"); if (paramThumbnailFolderStr == null || paramThumbnailFolderStr.isEmpty()) { mLog.warn("Thumbnail folder is not given; using default value (thumbs/)"); paramThumbnailFolderStr = "thumbs/"; } paramThumbnailFolder = new File(paramThumbnailFolderStr); if (thumbnailConfig != null) { try { paramThumbnailWidth = Integer.parseInt(thumbnailConfig.get("imageWidth")); paramThumbnailHeight = Integer.parseInt(thumbnailConfig.get("imageHeight")); } catch (NumberFormatException e) { mLog.warn("Could not parse desired thumbnail height/width (are these really integers?); using default values (" + THUMBNAIL_DEFAULT_WIDTH + "x" + THUMBNAIL_DEFAULT_WIDTH +")", e); } } if (paramThumbnailHeight <= 0) { mLog.warn("Invalid value for thumbnail height (" + paramThumbnailHeight + "): taking default " + THUMBNAIL_DEFAULT_WIDTH + "x" + THUMBNAIL_DEFAULT_WIDTH); paramThumbnailWidth = THUMBNAIL_DEFAULT_WIDTH; paramThumbnailHeight = THUMBNAIL_DEFAULT_HEIGHT; } else if (paramThumbnailWidth <= 0) { mLog.warn("Invalid value for thumbnail width (" + paramThumbnailWidth + "): taking default " + THUMBNAIL_DEFAULT_WIDTH + "x" + THUMBNAIL_DEFAULT_WIDTH); paramThumbnailWidth = THUMBNAIL_DEFAULT_WIDTH; paramThumbnailHeight = THUMBNAIL_DEFAULT_HEIGHT; } Map<String, String> externalConfig = config.getSectionWithName("externalHelpers"); if (externalConfig != null) { paramOpenOfficeHome = externalConfig.get("openOfficeHome"); if (paramOpenOfficeHome != null && !(new File(paramOpenOfficeHome).exists()) ) { mLog.error("ERROR: Could not find OpenOffice-Installation: The specified directory does not exist. Trying to auto-detect."); paramOpenOfficeHome = null; } paramOpenOfficeProfile = externalConfig.get("openOfficeProfile"); try { if (externalConfig.get("openOfficePort") != null) paramOpenOfficePort = Integer.parseInt(externalConfig.get("openOfficePort")); } catch (NumberFormatException e) { mLog.error("ERROR: OpenOfficePort is not a number"); } } JODConverterThumbnailer.setOpenOfficeHomeFolder(paramOpenOfficeHome); JODConverterThumbnailer.setOpenOfficeProfileFolder(paramOpenOfficeProfile); JODConverterThumbnailer.setOpenOfficePort(paramOpenOfficePort); mimeTypeDetector = new MimeTypeDetector(); generatedThumbnailsNotIndexed = new HashSet<String>(); } /** * Called before the crawling process starts (Crawler::run()). * Initialize Thumbnail Generation: * load Config and configure Thumbnailer * * This may be called multiple times during the lifetime of a plugin instance, * but CrawlerPlugin::onFinishCrawling() is always called in between. * * @param crawler The crawler instance that is about to begin crawling */ public void onStartCrawling(Crawler crawler) { if (thumbnailGenerationDeactivated) { mLog.info("Do not start thumbnail generation (deactivated)"); return; } mLog.info("Initialize Thumbnail Generation..."); thumbnailer = new ThumbnailerManager(); thumbnailer.setImageSize(paramThumbnailWidth, paramThumbnailHeight, 0); try { thumbnailer.setThumbnailFolder(paramThumbnailFolder); } catch (FileDoesNotExistException e) { mLog.error("Could not set Thumbnail Directory:", e); thumbnailer = null; mLog.warn("Thumbnailer Plugin was deactivated due to prior errors"); return; } try { thumbnailer.registerThumbnailer(new NativeImageThumbnailer()); thumbnailer.registerThumbnailer(new OpenOfficeThumbnailer()); thumbnailer.registerThumbnailer(new PDFBoxThumbnailer()); try { thumbnailer.registerThumbnailer(new JODWordConverterThumbnailer()); thumbnailer.registerThumbnailer(new JODExcelConverterThumbnailer()); thumbnailer.registerThumbnailer(new JODPowerpointConverterThumbnailer()); thumbnailer.registerThumbnailer(new JODHtmlConverterThumbnailer()); } catch (IOException e) { mLog.error("Could not initialize JODConverter:", e); } thumbnailer.registerThumbnailer(new ScratchThumbnailer()); } catch (RuntimeException e) { mLog.error("Not all thumbnailers could be registered:", e); } } /** * Called after the crawling process has finished or aborted (because of an exception): * Close the Thumbnail Generator. * * This may be called multiple times during the lifetime of a plugin instance. * * @param crawler The crawler instance that is about to finish crawling */ public void onFinishCrawling(Crawler crawler) { if (thumbnailer == null) return; if (!generatedThumbnailsNotIndexed.isEmpty()) { mLog.info("Delete all thumbnails whose documents were not added to the index..."); for (String location : generatedThumbnailsNotIndexed) { deleteThumbnail(location); } generatedThumbnailsNotIndexed.clear(); } mLog.info("De-Initialize Thumbnail Generation..."); thumbnailer.close(); thumbnailer = null; } /** * Called when a document is deleted from the index: * Delete the created thumbnail * * Note that when being replaced by another document ("update index"), * the old document is added to index first, deleting is part of the cleaning-up-at-the-end-Phase. * * @param doc Document to read * @param index Luce Index Reader */ public void onDeleteIndexEntry(Document doc, IndexReader index) { String location = getLuceneField(doc, LUCENE_FIELD_NAME_FILE_LOCATION); deleteThumbnail(location); } /** * Delete Thumbnail * @param location Specified thumbnail location * @return If deleted successfully. */ private boolean deleteThumbnail(String location) { if (location == null || location.isEmpty()) return false; File thumbnail = new File(paramThumbnailFolder, location); if (thumbnail.exists()) { mLog.info("Deleting thumbnail " + thumbnail.getName() + "..."); if (!thumbnail.delete()) { mLog.warn("Couldn't delete thumbnail " + thumbnail.getName() + " - will delete it on program exit."); thumbnail.deleteOnExit(); return false; } return true; } return false; } /** * Get a field that was added as "addional field" before. * * @param doc Lucene Index entry * @param fieldname Index entry name * @return String that was in this column */ protected String getLuceneField(Document doc, String fieldname) { String location; try { byte[] compressedData = doc.getBinaryValue(fieldname); if (compressedData == null) return null; location = CompressionTools.decompressString(compressedData); // uncompressed: // location = field.stringValue(); } catch (DataFormatException e) { mLog.error("Compressed field could not be decompressed."); return null; } return location; } /** * Called after a document is being prepared / parsed: * Create the thumbnail and add the information about its creation to the lucene entry. * * @param document Regain document that was analysed * @param preparator Preparator that has analysed this document */ public void onAfterPrepare(RawDocument document, WriteablePreparator preparator) { String thumbnailerStatus = ""; String thumbnailLocation = ""; if (!thumbnailGenerationDeactivated && thumbnailer != null) { // Fetch input file File input; try { input = document.getContentAsFile(); } catch (RegainException e) { mLog.error("File could not be thumbnailed: input file could not be retrieved from Regain", e); return; } // Get Mime String mimeType = mimeTypeDetector.getMimeType(input); mLog.debug("Detected Mime-Typ: " + mimeType); document.setMimeType(mimeType); // Generate Thumbnail File output = thumbnailer.chooseThumbnailFilename(input, true); try { thumbnailer.generateThumbnail(input, output, mimeType); thumbnailLocation = IOUtil.getRelativeFilename(paramThumbnailFolder, output); generatedThumbnailsNotIndexed.add(thumbnailLocation); mLog.info("Generated Thumbnail at " + thumbnailLocation); thumbnailerStatus = LUCENE_FIELD_VALUE_STATUS_OK; } catch (IOException e) { mLog.error("File could not be thumbnailed: ", e); thumbnailerStatus = LUCENE_FIELD_VALUE_STATUS_FAILED; } catch (ThumbnailerException e) { thumbnailerStatus = LUCENE_FIELD_VALUE_STATUS_NO_THUMBNAILER_FOUND; mLog.error("File could not be thumbnailed: ", e); } } // Add infos to Lucene index preparator.addAdditionalField(LUCENE_FIELD_NAME_STATUS, thumbnailerStatus); preparator.addAdditionalField(LUCENE_FIELD_NAME_FILE_LOCATION, thumbnailLocation); } /** * Called when a document as added to the index. * This may be a newly indexed document, or a document that has changed since * and, thus, is reindexed. * * @param doc Document to write * @param index Lucene Index Writer */ public void onCreateIndexEntry(Document doc, IndexWriter index) { String location = getLuceneField(doc, LUCENE_FIELD_NAME_FILE_LOCATION); generatedThumbnailsNotIndexed.remove(location); } }