package eu.europeana.cloud.service.dps.storm.transform.text.pdf; import eu.europeana.cloud.service.dps.storm.transform.text.TextExtractor; import java.io.IOException; import java.io.InputStream; import java.util.HashMap; import java.util.Map; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.pdf.PDFParser; import org.apache.tika.sax.BodyContentHandler; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; /** * Text extractor for PDF files that uses the Apache Tika toolkit * @author Pavel Kefurt <Pavel.Kefurt@gmail.com> */ public class TikaExtractor implements TextExtractor { private static final Logger LOGGER = LoggerFactory.getLogger(TikaExtractor.class); private Metadata metadata; @Override public String extractText(InputStream is) { if(is == null) { LOGGER.warn("No data for extraction."); return null; } BodyContentHandler handler = new BodyContentHandler(-1); // -1 to disable the write limit metadata = new Metadata(); ParseContext pcontext = new ParseContext(); PDFParser pdfparser = new PDFParser(); try { pdfparser.parse(is, handler, metadata,pcontext); } catch (IOException | SAXException | TikaException ex) { LOGGER.warn("Can not extract text from pdf because: " + ex.getMessage()); return null; } return handler.toString(); } @Override public PdfExtractionMethods getExtractionMethod() { return PdfExtractionMethods.TIKA_EXTRACTOR; } @Override public Map<String, String> getExtractedMetadata() { if(metadata == null) { return null; } Map<String, String> ret = new HashMap<>(); for (String name : metadata.names()) { ret.put(name, metadata.get(name)); } return ret; } @Override public String getRepresentationName() { return "text-from-pdf"; } }