/*
* (C) Copyright 2002-2010 Nuxeo SA (http://nuxeo.com/) and contributors.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the GNU Lesser General Public License
* (LGPL) version 2.1 which accompanies this distribution, and is available at
* http://www.gnu.org/licenses/lgpl.html
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* Contributors:
* Julien Anguenot
* Florent Guillaume
*/
package org.nuxeo.ecm.core.convert.plugins.text.extractors;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.lang.reflect.Field;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.util.PDFOperator;
import org.apache.pdfbox.util.PDFStreamEngine;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.operator.OperatorProcessor;
import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.api.impl.blob.FileBlob;
import org.nuxeo.ecm.core.api.impl.blob.StringBlob;
import org.nuxeo.ecm.core.convert.api.ConversionException;
import org.nuxeo.ecm.core.convert.cache.SimpleCachableBlobHolder;
import org.nuxeo.ecm.core.convert.extension.Converter;
import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor;
/**
 * Converter that extracts the text content of a PDF blob using PDFBox.
 * <p>
 * The extracted text is returned as a UTF-8 {@code text/plain} blob. When the
 * document's access permissions do not allow content extraction, an empty
 * string blob is returned instead.
 */
public class PDF2TextConverter implements Converter {

    /**
     * Text stripper that uses platform independent separators and that logs
     * (instead of propagating) the errors raised while processing individual
     * PDF operators.
     */
    public static class PatchedPDFTextStripper extends PDFTextStripper {

        /**
         * Keys of the errors already logged by {@link #processOperator}, shared
         * by all instances. Always accessed while synchronized on the set
         * itself.
         */
        final static Set<StackTraceElement> loggedStacks = new HashSet<StackTraceElement>();

        public PatchedPDFTextStripper() throws IOException {
            super();
            // platform independent line and paragraph separators
            setLineSeparator("\n");
            setParagraphEnd("\n\n");
            setArticleEnd("\n\n");
        }

        /**
         * Reads, through reflection and bypassing access checks, the value
         * that a field declared on {@link PDFStreamEngine} has for this
         * instance.
         *
         * @param name the name of the field declared on PDFStreamEngine
         * @return the current value of the field
         * @throws RuntimeException if the field cannot be found or read
         */
        protected Object unrestrictedAccess(String name) {
            try {
                Field f = PDFStreamEngine.class.getDeclaredField(name);
                f.setAccessible(true);
                return f.get(this);
            } catch (Exception e) {
                throw new RuntimeException(
                        "Cannot get access to PDFStreamEngine fields", e);
            }
        }

        /**
         * Returns the {@code unsupportedOperators} field of
         * {@link PDFStreamEngine}, read reflectively.
         */
        @SuppressWarnings("unchecked")
        protected Set<String> unsupportedOperators() {
            return (Set<String>) unrestrictedAccess("unsupportedOperators");
        }

        /**
         * Returns the {@code operators} field of {@link PDFStreamEngine}, read
         * reflectively.
         */
        @SuppressWarnings("unchecked")
        protected Map<String, OperatorProcessor> operators() {
            return (Map<String, OperatorProcessor>) unrestrictedAccess("operators");
        }

        /**
         * Dispatches the operator to its registered processor, if any.
         * <p>
         * An operator without processor is reported once at info level. Any
         * exception raised during processing is caught and logged so that
         * extraction can continue; a given error origin (first stack frame) is
         * logged only once.
         */
        @Override
        protected void processOperator(PDFOperator operator,
                List<COSBase> arguments) throws IOException {
            try {
                String operation = operator.getOperation();
                OperatorProcessor processor = operators().get(operation);
                if (processor != null) {
                    processor.setContext(this);
                    processor.process(operator, arguments);
                } else {
                    Set<String> unsupported = unsupportedOperators();
                    // add() returns true only the first time the operation
                    // is seen
                    if (unsupported.add(operation)) {
                        log.info("unsupported/disabled operation: " + operation);
                    }
                }
            } catch (Exception e) {
                StackTraceElement root = originOf(e);
                synchronized (loggedStacks) {
                    if (!loggedStacks.add(root)) {
                        // this error origin has already been logged
                        return;
                    }
                }
                log.warn(
                        "Caught error in pdfbox during extraction (stack logged only once)",
                        e);
            }
        }

        /**
         * Returns the key used to log a given error only once: the first
         * frame of its stack trace, or a synthetic frame named after the
         * exception class when the stack trace is empty (which
         * {@link Throwable#getStackTrace()} allows).
         */
        private static StackTraceElement originOf(Throwable t) {
            StackTraceElement[] trace = t.getStackTrace();
            if (trace.length > 0) {
                return trace[0];
            }
            return new StackTraceElement(t.getClass().getName(),
                    "<no stack trace>", null, -1);
        }
    }

    private static final Log log = LogFactory.getLog(PDF2TextConverter.class);

    /**
     * Extracts the text of the PDF held by the given blob holder.
     *
     * @param blobHolder the holder of the PDF blob to convert
     * @param parameters conversion parameters, not used by this converter
     * @return a holder of the extracted text as a UTF-8 {@code text/plain}
     *         blob, or of an empty string blob when content extraction is not
     *         permitted on the document
     * @throws ConversionException if anything fails while loading the document
     *             or extracting and storing its text
     */
    @Override
    public BlobHolder convert(BlobHolder blobHolder,
            Map<String, Serializable> parameters) throws ConversionException {
        InputStream source = null;
        PDDocument document = null;
        try {
            source = blobHolder.getBlob().getStream();
            document = PDDocument.load(source);
            // NXP-1556: if document is protected an IOException will be raised
            // Instead of catching the exception based on its message string
            // lets avoid sending messages that will generate this error
            // code taken from PDFTextStripper.writeText source.
            // only care about standard encryption and if it was decrypted with
            // the user password
            AccessPermission permission = document.getCurrentAccessPermission();
            if (!permission.canExtractContent()) {
                return new SimpleCachableBlobHolder(new StringBlob(""));
            }
            PatchedPDFTextStripper textStripper = new PatchedPDFTextStripper();
            // use the position information to heuristically organize the
            // extracted paragraphs. This is also important for
            // right-to-left languages.
            textStripper.setSortByPosition(true);
            // non breaking spaces are deliberately left untouched in the text
            String text = textStripper.getText(document);
            return new SimpleCachableBlobHolder(toFileBlob(text));
        } catch (Exception e) {
            throw new ConversionException(
                    "Error during text extraction with PDFBox", e);
        } finally {
            if (document != null) {
                try {
                    document.close();
                } catch (Exception e) {
                    log.error("Error while closing PDFBox document", e);
                }
            }
            closeQuietly(source);
        }
    }

    /**
     * Writes the text, UTF-8 encoded, to a temporary file and builds a
     * {@code text/plain} FileBlob from that file's content. The temporary file
     * is deleted before returning.
     */
    private static FileBlob toFileBlob(String text) throws IOException {
        File tmp = File.createTempFile("pdfboplugin", ".txt");
        try {
            OutputStream out = new FileOutputStream(tmp);
            try {
                out.write(text.getBytes("UTF-8"));
            } finally {
                // closed before the file is read back; a failure here means
                // the content may be incomplete, so it is propagated
                out.close();
            }
            InputStream in = new FileInputStream(tmp);
            try {
                // NOTE(review): assumes the FileBlob constructor consumes the
                // stream eagerly; the temporary file was already deleted
                // right after construction before this change -- confirm
                return new FileBlob(in, "text/plain", "UTF-8");
            } finally {
                // an open stream would leak a descriptor and prevent the
                // deletion below on some platforms
                closeQuietly(in);
            }
        } finally {
            if (!tmp.delete()) {
                log.warn("Could not delete temporary file " + tmp);
            }
        }
    }

    /**
     * Closes the stream if it is not null, logging instead of propagating
     * any failure.
     */
    private static void closeQuietly(InputStream in) {
        if (in == null) {
            return;
        }
        try {
            in.close();
        } catch (IOException e) {
            log.error("Error while closing stream", e);
        }
    }

    /**
     * Does nothing: nothing is read from the descriptor.
     */
    @Override
    public void init(ConverterDescriptor descriptor) {
    }
}