package com.openkm.extractor; import java.io.BufferedInputStream; import java.io.CharArrayReader; import java.io.CharArrayWriter; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.io.StringReader; import java.util.Iterator; import java.util.List; import java.util.Map; import org.apache.jackrabbit.extractor.AbstractTextExtractor; import org.apache.pdfbox.pdfparser.PDFParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDResources; import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage; import org.apache.pdfbox.util.PDFTextStripper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.openkm.core.Config; import com.openkm.util.FileUtils; /** * Text extractor for Portable Document Format (PDF). */ public class PdfTextExtractor extends AbstractTextExtractor { /** * Logger instance. */ private static final Logger log = LoggerFactory.getLogger(PdfTextExtractor.class); /** * Force loading of dependent class. */ static { PDFParser.class.getName(); } /** * Creates a new <code>PdfTextExtractor</code> instance. */ public PdfTextExtractor() { super(new String[]{ "application/pdf" }); } //-------------------------------------------------------< TextExtractor > /** * {@inheritDoc} */ @SuppressWarnings("rawtypes") public Reader extractText(InputStream stream, String type, String encoding) throws IOException { try { PDFParser parser = new PDFParser(new BufferedInputStream(stream)); try { parser.parse(); PDDocument document = parser.getPDDocument(); CharArrayWriter writer = new CharArrayWriter(); PDFTextStripper stripper = new PDFTextStripper(); stripper.setLineSeparator("\n"); stripper.writeText(document, writer); String st = writer.toString().trim(); log.debug("TextStripped: '{}'", st); if (Config.SYSTEM_PDF_FORCE_OCR || st.length() <= 1) { log.warn("PDF does not contains text layer"); // Extract images from PDF List pages = document.getDocumentCatalog().getAllPages(); StringBuilder sb = new StringBuilder(); for (Iterator itPg = pages.iterator(); itPg.hasNext(); ) { PDPage page = (PDPage) itPg.next(); PDResources resources = page.getResources(); Map images = resources.getImages(); if (images != null) { for (Iterator itImg = images.keySet().iterator(); itImg.hasNext(); ) { String key = (String) itImg.next(); PDXObjectImage image = (PDXObjectImage) images.get(key); File pdfImg = File.createTempFile(key, "." + image.getSuffix()); log.debug("Writing image: {}", pdfImg.getPath()); image.write2file(pdfImg); String txt = new CuneiformTextExtractor().doOcr(pdfImg); sb.append(txt).append(" "); log.debug("OCR Extracted: {}", txt); FileUtils.deleteQuietly(pdfImg); } } } return new StringReader(sb.toString()); } else { return new CharArrayReader(writer.toCharArray()); } } finally { try { PDDocument doc = parser.getPDDocument(); if (doc != null) { doc.close(); } } catch (IOException e) { // ignore } } } catch (Exception e) { // it may happen that PDFParser throws a runtime // exception when parsing certain pdf documents log.warn("Failed to extract PDF text content", e); return new StringReader(""); } finally { stream.close(); } } }