/** * The contents of this file are subject to the license and copyright * detailed in the LICENSE and NOTICE files at the root of the source * tree and available online at * * http://www.dspace.org/license/ */ package org.dspace.app.mediafilter; import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.util.Arrays; import org.apache.log4j.Logger; import org.dspace.core.ConfigurationManager; import org.dspace.core.Utils; /** * Text MediaFilter for PDF sources * * This filter produces extracted text suitable for building an index, * but not for display to end users. * It forks a process running the "pdftotext" program from the * XPdf suite -- see http://www.foolabs.com/xpdf/ * This is a suite of open-source PDF tools that has been widely ported * to Unix platforms and the ones we use (pdftoppm, pdftotext) even * run on Win32. * * This was written for the FACADE project but it is not directly connected * to any of the other FACADE-specific software. The FACADE UI expects * to find thumbnail images for 3D PDFs generated by this filter. * * Requires DSpace config properties keys: * * xpdf.path.pdftotext -- path to "pdftotext" executable (required!) * * @author Larry Stone * @see org.dspace.app.mediafilter.MediaFilter */ public class XPDF2Text extends MediaFilter { private static Logger log = Logger.getLogger(XPDF2Text.class); // Command to get text from pdf; @infile@, @COMMAND@ are placeholders private static final String XPDF_PDFTOTEXT_COMMAND[] = { "@COMMAND@", "-q", "-enc", "UTF-8", "@infile@", "-" }; // executable path that comes from DSpace config at runtime. private String pdftotextPath = null; public String getFilteredName(String oldFilename) { return oldFilename + ".txt"; } public String getBundleName() { return "TEXT"; } public String getFormatString() { return "Text"; } public String getDescription() { return "Extracted Text"; } public InputStream getDestinationStream(InputStream sourceStream) throws Exception { // get configured value for path to XPDF command: if (pdftotextPath == null) { pdftotextPath = ConfigurationManager.getProperty("xpdf.path.pdftotext"); if (pdftotextPath == null) { throw new IllegalStateException("No value for key \"xpdf.path.pdftotext\" in DSpace configuration! Should be path to XPDF pdftotext executable."); } } File sourceTmp = File.createTempFile("DSfilt",".pdf"); sourceTmp.deleteOnExit(); // extra insurance, we'll delete it here. int status = -1; try { // make local temp copy of source PDF since PDF tools // require a file for random access. // XXX fixme could optimize if we ever get an interface to grab asset *files* OutputStream sto = new FileOutputStream(sourceTmp); Utils.copy(sourceStream, sto); sto.close(); sourceStream.close(); String pdfCmd[] = XPDF_PDFTOTEXT_COMMAND.clone(); pdfCmd[0] = pdftotextPath; pdfCmd[4] = sourceTmp.toString(); log.debug("Running command: "+Arrays.deepToString(pdfCmd)); Process pdfProc = Runtime.getRuntime().exec(pdfCmd); InputStream stdout = pdfProc.getInputStream(); ByteArrayOutputStream baos = new ByteArrayOutputStream(); Utils.copy(new BufferedInputStream(stdout), baos); stdout.close(); baos.close(); status = pdfProc.waitFor(); String msg = null; if (status == 1) { msg = "pdftotext failed opening input: file=" + sourceTmp.toString(); } else if (status == 3) { msg = "pdftotext permission failure (perhaps copying of text from this document is not allowed - check PDF file's internal permissions): file=" + sourceTmp.toString(); } else if (status != 0) { msg = "pdftotext failed, maybe corrupt PDF? status=" + String.valueOf(status); } if (msg != null) { log.error(msg); throw new IOException(msg); } return new ByteArrayInputStream(baos.toByteArray()); } catch (InterruptedException e) { log.error("Failed in pdftotext subprocess: ",e); throw e; } finally { if (!sourceTmp.delete()) { log.error("Unable to delete temporary file"); } if (status != 0) { log.error("PDF conversion proc failed, returns=" + status + ", file=" + sourceTmp); } } } }