/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.app.mediafilter;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import org.apache.log4j.Logger;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.dspace.core.ConfigurationManager;
/*
*
* to do: helpful error messages - can't find mediafilter.cfg - can't
* instantiate filter - bitstream format doesn't exist
*
*/
public class PDFFilter extends MediaFilter
{
private static Logger log = Logger.getLogger(PDFFilter.class);
public String getFilteredName(String oldFilename)
{
return oldFilename + ".txt";
}
/**
* @return String bundle name
*
*/
public String getBundleName()
{
return "TEXT";
}
/**
* @return String bitstreamformat
*/
public String getFormatString()
{
return "Text";
}
/**
* @return String description
*/
public String getDescription()
{
return "Extracted text";
}
/**
* @param source
* source input stream
*
* @return InputStream the resulting input stream
*/
public InputStream getDestinationStream(InputStream source)
throws Exception
{
try
{
boolean useTemporaryFile = ConfigurationManager.getBooleanProperty("pdffilter.largepdfs", false);
// get input stream from bitstream
// pass to filter, get string back
PDFTextStripper pts = new PDFTextStripper();
PDDocument pdfDoc = null;
Writer writer = null;
File tempTextFile = null;
ByteArrayOutputStream byteStream = null;
if (useTemporaryFile)
{
tempTextFile = File.createTempFile("dspacepdfextract" + source.hashCode(), ".txt");
tempTextFile.deleteOnExit();
writer = new OutputStreamWriter(new FileOutputStream(tempTextFile));
}
else
{
byteStream = new ByteArrayOutputStream();
writer = new OutputStreamWriter(byteStream);
}
try
{
pdfDoc = PDDocument.load(source);
pts.writeText(pdfDoc, writer);
}
finally
{
try
{
if (pdfDoc != null)
{
pdfDoc.close();
}
}
catch(Exception e)
{
log.error("Error closing PDF file: " + e.getMessage(), e);
}
try
{
writer.close();
}
catch(Exception e)
{
log.error("Error closing temporary extract file: " + e.getMessage(), e);
}
}
if (useTemporaryFile)
{
return new FileInputStream(tempTextFile);
}
else
{
byte[] bytes = byteStream.toByteArray();
return new ByteArrayInputStream(bytes);
}
}
catch (OutOfMemoryError oome)
{
log.error("Error parsing PDF document " + oome.getMessage(), oome);
if (!ConfigurationManager.getBooleanProperty("pdffilter.skiponmemoryexception", false))
{
throw oome;
}
}
return null;
}
}