package eu.europeana.cloud.service.dps.storm.transform.text.pdf;
import eu.europeana.cloud.service.dps.storm.transform.text.TextExtractor;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.util.PDFTextStripper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Text extractor for PDF files that uses the Apache PDFBox library
* @author Pavel Kefurt <Pavel.Kefurt@gmail.com>
*/
public class PdfBoxExtractor implements TextExtractor
{
private static final Logger LOGGER = LoggerFactory.getLogger(PdfBoxExtractor.class);
private Map<String, String> extractedMetadata;
@Override
public String extractText(InputStream is)
{
if(is == null)
{
LOGGER.warn("No data for extraction.");
return null;
}
PDFParser parser;
String parsedText = null;
PDFTextStripper pdfStripper = null;
PDDocument pdDoc = null;
COSDocument cosDoc = null;
try
{
parser = new PDFParser(is);
parser.parse();
cosDoc = parser.getDocument();
pdfStripper = new PDFTextStripper();
pdDoc = new PDDocument(cosDoc);
PDDocumentInformation info = pdDoc.getDocumentInformation();
Set<String> mdKeys = info.getMetadataKeys();
extractedMetadata = new HashMap<>();
for (String key:mdKeys){
String value = (String)info.getPropertyStringValue(key);
extractedMetadata.put(key, value);
}
parsedText = pdfStripper.getText(pdDoc); //possible NULL pointer if document is encrypted
}
catch (IOException ex)
{
LOGGER.warn("Can not extract text from pdf because: " + ex.getMessage());
}
finally
{
try
{
if (cosDoc != null)
cosDoc.close();
if (pdDoc != null)
pdDoc.close();
}
catch (IOException ex)
{}
}
return parsedText;
}
@Override
public PdfExtractionMethods getExtractionMethod()
{
return PdfExtractionMethods.PDFBOX_EXTRACTOR;
}
@Override
public Map<String, String> getExtractedMetadata()
{
return this.extractedMetadata;
// return null; //TODO: extract metadata!
}
@Override
public String getRepresentationName()
{
return "text-from-pdf";
}
}