/*
 * (C) Copyright 2006-2016 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Julien Anguenot
 *     Florent Guillaume
 */
package org.nuxeo.ecm.core.convert.plugins.text.extractors;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.Serializable;
import java.lang.reflect.Field;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.util.PDFOperator;
import org.apache.pdfbox.util.PDFStreamEngine;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.operator.OperatorProcessor;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.Blobs;
import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.convert.api.ConversionException;
import org.nuxeo.ecm.core.convert.cache.SimpleCachableBlobHolder;
import org.nuxeo.ecm.core.convert.extension.Converter;
import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor;
import org.nuxeo.runtime.api.Framework;

/**
 * Converter that extracts the text of a PDF blob with PDFBox and returns it as a UTF-8 {@code text/plain} blob.
 */
public class PDF2TextConverter implements Converter {

    /**
     * Text stripper that uses platform-independent separators and that logs, instead of propagating, the
     * {@link IOException}s raised while processing individual PDF operators.
     */
    public static class PatchedPDFTextStripper extends PDFTextStripper {

        /**
         * First stack frames of the exceptions already logged by {@link #processOperator}. Shared by all
         * instances; every access must be synchronized on the set itself.
         */
        static final Set<StackTraceElement> loggedStacks = new HashSet<>();

        public PatchedPDFTextStripper() throws IOException {
            super();
            // platform independent line and paragraph separators
            setLineSeparator("\n");
            setParagraphEnd("\n\n");
            setArticleEnd("\n\n");
        }

        /**
         * Reads, through reflection, the value of a field declared by {@link PDFStreamEngine} on this instance.
         *
         * @param name the name of the field declared in {@link PDFStreamEngine}
         * @return the current value of that field for this instance
         * @throws RuntimeException if the field does not exist or cannot be read
         */
        protected Object unrestrictedAccess(String name) {
            try {
                Field f = PDFStreamEngine.class.getDeclaredField(name);
                f.setAccessible(true);
                return f.get(this);
            } catch (ReflectiveOperationException e) {
                throw new RuntimeException("Cannot get access to PDFStreamEngine fields", e);
            }
        }

        /**
         * @return the value of the {@code unsupportedOperators} field of {@link PDFStreamEngine}, read reflectively
         */
        @SuppressWarnings("unchecked")
        protected Set<String> unsupportedOperators() {
            return (Set<String>) unrestrictedAccess("unsupportedOperators");
        }

        /**
         * @return the value of the {@code operators} field of {@link PDFStreamEngine}, read reflectively
         */
        @SuppressWarnings("unchecked")
        protected Map<String, OperatorProcessor> operators() {
            return (Map<String, OperatorProcessor>) unrestrictedAccess("operators");
        }

        /**
         * Dispatches the operator to its registered processor, if any.
         * <p>
         * An operation without processor is reported at info level the first time it is added to the unsupported
         * operators set. An {@link IOException} raised during processing is not propagated: it is logged as a
         * warning, once per distinct top stack frame.
         */
        @Override
        protected void processOperator(PDFOperator operator, List<COSBase> arguments) throws IOException {
            try {
                String operation = operator.getOperation();
                OperatorProcessor processor = operators().get(operation);
                if (processor != null) {
                    processor.setContext(this);
                    processor.process(operator, arguments);
                } else if (unsupportedOperators().add(operation)) {
                    // add() returns true only for a not-yet-recorded operation: a single reflective lookup
                    // both records the operation and tells whether it has to be logged
                    log.info("unsupported/disabled operation: " + operation);
                }
            } catch (IOException e) {
                StackTraceElement[] stack = e.getStackTrace();
                // an exception may carry no stack trace at all; null then acts as the shared key
                StackTraceElement root = stack.length > 0 ? stack[0] : null;
                boolean firstOccurrence;
                synchronized (loggedStacks) {
                    firstOccurrence = loggedStacks.add(root);
                }
                if (firstOccurrence) {
                    log.warn("Caught error in pdfbox during extraction (stack logged only once)", e);
                }
            }
        }

    }

    private static final Log log = LogFactory.getLog(PDF2TextConverter.class);

    /**
     * Extracts the text of the PDF held by the given blob holder.
     *
     * @param blobHolder the holder whose blob is the PDF to convert
     * @param parameters conversion parameters (not read by this converter)
     * @return a holder of the extracted text as a UTF-8 {@code text/plain} blob, or of an empty blob when the
     *         current access permission of the document does not allow content extraction
     * @throws ConversionException if PDFBox or the temporary file handling raises an {@link IOException}
     */
    @Override
    public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters)
            throws ConversionException {
        PDDocument document = null;
        File f = null;
        try {
            document = PDDocument.load(blobHolder.getBlob().getStream());
            // NXP-1556: if the document is protected an IOException will be raised.
            // Instead of catching the exception based on its message string, let's avoid
            // sending messages that would generate this error (check taken from the
            // PDFTextStripper.writeText source): only care about standard encryption and
            // whether it was decrypted with the user password.
            AccessPermission permission = document.getCurrentAccessPermission();
            if (!permission.canExtractContent()) {
                return new SimpleCachableBlobHolder(Blobs.createBlob(""));
            }
            PatchedPDFTextStripper textStripper = new PatchedPDFTextStripper();
            // use the position information to heuristically organize the
            // extracted paragraphs. This is also important for
            // right-to-left languages.
            textStripper.setSortByPosition(true);
            // the extracted text is written out unchanged: non-breaking spaces are not
            // replaced by regular spaces
            String text = textStripper.getText(document);
            f = Framework.createTempFile("pdfboplugin", ".txt");
            // close the writer before reading the file back, so that a failed
            // write or close is reported instead of producing a truncated blob
            try (OutputStream fas = new FileOutputStream(f)) {
                fas.write(text.getBytes(StandardCharsets.UTF_8));
            }
            try (FileInputStream is = new FileInputStream(f)) {
                Blob blob = Blobs.createBlob(is, "text/plain", "UTF-8");
                return new SimpleCachableBlobHolder(blob);
            }
        } catch (IOException e) {
            throw new ConversionException("Error during text extraction with PDFBox", e);
        } finally {
            if (document != null) {
                try {
                    document.close();
                } catch (IOException e) {
                    log.error("Error while closing PDFBox document", e);
                }
            }
            if (f != null && !f.delete()) {
                log.warn("Could not delete temporary file: " + f);
            }
        }
    }

    @Override
    public void init(ConverterDescriptor descriptor) {
        // nothing to initialize
    }

}