package org.bbaw.wsp.cms.dochandler.parser.text.parser; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.util.PDFTextStripper; import org.apache.tika.parser.pdf.PDFParser; import org.bbaw.wsp.cms.dochandler.parser.document.PdfDocument; import org.bbaw.wsp.cms.document.MetadataRecord; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; /** * This class parses a PDF file. It now uses Apache PDFBox. It uses the * Singleton pattern. Only one instance can exist. * * @author Sascha Feldmann (wsp-shk1) * @date 08.08.2012 * @version 2.0 * */ public class PdfParserImpl extends ResourceParser { private static PdfParserImpl instance; /** * Return the only existing instance. The instance uses an Apache PdfBox * stripper. * * @return */ public static PdfParserImpl getInstance() { if (instance == null) { return new PdfParserImpl(); } return instance; } // Protected because this parser may get extended protected PdfParserImpl() { super(new PDFParser()); } /** * Parse a pdf-document and return the object returned by the * {@link ISaveStrategy} . * * @return Object returned by the {@link ISaveStrategy} * @throws ApplicationException * @throws IllegalArgumentException * if the uri is null or empty. * @throws IllegalStateException * if the {@link ISaveStrategy} wasn't set before. */ public Object parse(final String startUri, final String uri) throws ApplicationException { if (uri == null || uri.isEmpty()) { throw new IllegalArgumentException("The value for the parameter parser in the method parse() in PdfParserImpl mustn't be empty."); } if (this.saveStrategy == null) { throw new IllegalStateException("You must define a saveStategy before calling the parse()-method in ResourceParser."); } try { PDDocument document; InputStream input = this.resourceReader.read(uri); document = PDDocument.load(input); List<String> pagesTexts = new ArrayList<String>(); String text = ""; PDFTextStripper stripper = new PDFTextStripper(); for (int i = 1; i <= document.getNumberOfPages(); i++) { stripper.setStartPage(i); stripper.setEndPage(i); text = stripper.getText(document);; pagesTexts.add(text); } document.close(); input.close(); PdfDocument doc = (PdfDocument) this.saveStrategy.generateDocumentModel(uri, uri, pagesTexts); doc.setMetadata(new MetadataRecord()); // Set the standard metadata (page // count, mimetype,...) return doc; } catch (IOException e) { throw new ApplicationException("Problem while parsing file " + uri + " -- exception: " + e.getMessage() + "\n"); } } }