package org.openedit.entermedia.scanner;

import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.encryption.DocumentEncryption;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.util.PDFTextStripper;

/**
 * Extracts text, title, page count and first-page dimensions from a PDF stream
 * using PDFBox and stores them in a {@link Parse} result.
 */
public class PdfParser
{
    private static final Log log = LogFactory.getLog(PdfParser.class);

    /**
     * Parses a PDF and collects its text and basic metadata.
     *
     * <p>Failures are logged rather than thrown: whatever was collected before
     * the failure is returned, so the result may be partially filled or empty.
     * The supplied stream is not closed by this method.
     *
     * @param inContent stream positioned at the start of the PDF data
     * @return a {@link Parse}, never {@code null}; on success it holds the
     *         scrubbed text, page count, title, and, when the first page
     *         exposes a box, "width" and "height" entries
     */
    public Parse parse(InputStream inContent)
    {
        Parse results = new Parse();
        PDDocument pdf = null;
        try
        {
            PDFParser parser = new PDFParser(inContent);
            parser.parse();
            pdf = parser.getPDDocument();
            if (pdf.isEncrypted())
            {
                DocumentEncryption decryptor = new DocumentEncryption(pdf);
                // Just try using the default (empty) password and move on
                decryptor.decryptDocument("");
            }

            // collect text
            // TODO: Write this out to a temp file that will be indexed separately
            results.setText(scrubChars(extractText(pdf)));

            int pageCount = pdf.getNumberOfPages();
            results.setPages(pageCount);

            // collect title
            PDDocumentInformation info = pdf.getDocumentInformation();
            results.setTitle(info.getTitle());

            if (pageCount > 0)
            {
                recordFirstPageSize(pdf, results);
            }
        }
        catch (CryptographyException e)
        {
            log.error("Error decrypting document.", e);
        }
        catch (InvalidPasswordException e)
        {
            log.error("Can't decrypt document - invalid password.", e);
        }
        catch (Exception e)
        {
            // Includes runtime exceptions raised by malformed documents
            log.error("Can't be handled as pdf document.", e);
        }
        finally
        {
            if (pdf != null)
            {
                try
                {
                    pdf.close();
                }
                catch (IOException e)
                {
                    // Closing is best effort; the parse result is still usable
                    log.warn("Could not close pdf document.", e);
                }
            }
        }
        return results;
    }

    /**
     * Pulls the text out of the document, falling back to an empty string when
     * extraction fails so that the remaining metadata can still be collected.
     */
    private String extractText(PDDocument pdf) throws IOException
    {
        // Constructed outside the try block so a failure to create the stripper
        // still propagates to the caller, as it always has.
        PDFTextStripper stripper = new PDFTextStripper();
        try
        {
            return stripper.getText(pdf);
        }
        catch (Throwable e)
        {
            // Deliberately broad: a text extraction failure of any kind must
            // not abort the rest of the parse.
            log.error("Could not parse", e);
            return "";
        }
    }

    /**
     * Stores the rounded width and height of the first page under the "width"
     * and "height" keys, using the media box or, if that is absent, the art box.
     */
    private void recordFirstPageSize(PDDocument pdf, Parse results)
    {
        PDPage page = (PDPage) pdf.getDocumentCatalog().getAllPages().get(0);
        PDRectangle box = page.getMediaBox();
        if (box == null)
        {
            box = page.getArtBox();
        }
        if (box != null)
        {
            results.put("width", String.valueOf(Math.round(box.getWidth())));
            results.put("height", String.valueOf(Math.round(box.getHeight())));
        }
    }

    /**
     * Removes control characters (code points 0-31) other than tab, line feed
     * and carriage return.
     *
     * @param inVal text to clean; {@code null} is treated as empty
     * @return the cleaned text, never {@code null}
     */
    protected String scrubChars(String inVal)
    {
        if (inVal == null)
        {
            return "";
        }
        StringBuilder done = new StringBuilder(inVal.length());
        for (int i = 0; i < inVal.length(); i++)
        {
            char c = inVal.charAt(i);
            // Tab, LF and CR are safe; everything else must be above 31
            if (c > 31 || c == '\t' || c == '\n' || c == '\r')
            {
                done.append(c);
            }
        }
        return done.toString();
    }
}