/* * PDFFilter.java * * Version: $Revision: 3733 $ * * Date: $Date: 2009-04-24 03:52:11 +0000 (Fri, 24 Apr 2009) $ * * Copyright (c) 2002-2009, The DSpace Foundation. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * - Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * - Neither the name of the DSpace Foundation nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ package org.dspace.app.mediafilter; import java.io.ByteArrayInputStream; import java.io.InputStream; import java.io.Writer; import java.io.File; import java.io.FileWriter; import java.io.FileInputStream; import java.io.BufferedWriter; import java.io.OutputStreamWriter; import java.io.FileOutputStream; import java.io.CharArrayWriter; import java.io.ByteArrayOutputStream; import java.util.List; import java.util.Iterator; import org.apache.log4j.Logger; import org.pdfbox.pdfparser.PDFParser; import org.pdfbox.pdmodel.PDDocument; import org.pdfbox.pdmodel.PDPage; import org.pdfbox.pdmodel.common.PDStream; import org.pdfbox.util.PDFTextStripper; import org.dspace.core.ConfigurationManager; /* * * to do: helpful error messages - can't find mediafilter.cfg - can't * instantiate filter - bitstream format doesn't exist * */ public class PDFFilter extends MediaFilter { private static Logger log = Logger.getLogger(PDFFilter.class); public String getFilteredName(String oldFilename) { return oldFilename + ".txt"; } /** * @return String bundle name * */ public String getBundleName() { return "TEXT"; } /** * @return String bitstreamformat */ public String getFormatString() { return "Text"; } /** * @return String description */ public String getDescription() { return "Extracted text"; } /** * @param source * source input stream * * @return InputStream the resulting input stream */ public InputStream getDestinationStream(InputStream source) throws Exception { try { boolean useTemporaryFile = ConfigurationManager.getBooleanProperty("pdffilter.largepdfs", false); // get input stream from bitstream // pass to filter, get string back PDFTextStripper pts = new PDFTextStripper(); PDDocument pdfDoc = null; Writer writer = null; File tempTextFile = null; ByteArrayOutputStream byteStream = null; if (useTemporaryFile) { tempTextFile = File.createTempFile("dspacepdfextract" + source.hashCode(), ".txt"); tempTextFile.deleteOnExit(); writer = new OutputStreamWriter(new FileOutputStream(tempTextFile)); } else { byteStream = new ByteArrayOutputStream(); writer = new OutputStreamWriter(byteStream); } try { pdfDoc = PDDocument.load(source); pts.writeText(pdfDoc, writer); } finally { try { if (pdfDoc != null) pdfDoc.close(); } catch(Exception e) { log.error("Error closing PDF file: " + e.getMessage(), e); } try { writer.close(); } catch(Exception e) { log.error("Error closing temporary extract file: " + e.getMessage(), e); } } if (useTemporaryFile) { return new FileInputStream(tempTextFile); } else { byte[] bytes = byteStream.toByteArray(); return new ByteArrayInputStream(bytes); } } catch (OutOfMemoryError oome) { log.error("Error parsing PDF document " + oome.getMessage(), oome); if (!ConfigurationManager.getBooleanProperty("pdffilter.skiponmemoryexception", false)) { throw oome; } } return null; } }