/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.tika;
import java.io.IOException;
import java.io.Writer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOExceptionWithCause;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
* Utility class that overrides the {@link PDFTextStripper} functionality
* to produce semi-structured XHTML SAX events instead of a plain text
* stream.
*/
class PDF2XHTML extends PDFTextStripper {

    /** XHTML wrapper around the caller's content handler; all output is sent through it. */
    private final XHTMLContentHandler handler;

    /** Whether {@link #endPage(PDPage)} also emits the text of FreeText annotations. */
    private final boolean extractAnnotationText;

    // TODO: remove once PDFBOX-1130 is fixed:
    /**
     * Tracks whether a {@code <p>} element is currently open, so that
     * unbalanced paragraph start/end callbacks still yield balanced output.
     */
    private boolean inParagraph = false;

    /**
     * Converts the given PDF document (and related metadata) to a stream
     * of XHTML SAX events sent to the given content handler.
     *
     * @param document PDF document
     * @param handler SAX content handler
     * @param metadata PDF metadata
     * @param extractAnnotationText whether the title, subject and contents
     *        of FreeText annotations are emitted at the end of each page
     * @param enableAutoSpace if {@code true} the word separator is a single
     *        space, otherwise it is the empty string
     * @param suppressDuplicateOverlappingText value passed on to
     *        {@link PDFTextStripper#setSuppressDuplicateOverlappingText(boolean)}
     * @param sortByPosition value passed on to
     *        {@link PDFTextStripper#setSortByPosition(boolean)}
     * @throws SAXException if the content handler fails to process SAX events
     * @throws TikaException if the PDF document cannot be processed
     */
    public static void process(
            PDDocument document, ContentHandler handler, Metadata metadata,
            boolean extractAnnotationText, boolean enableAutoSpace,
            boolean suppressDuplicateOverlappingText, boolean sortByPosition)
            throws SAXException, TikaException {
        try {
            // Extract text using a dummy Writer as we override the
            // key methods to output to the given content handler.
            new PDF2XHTML(handler, metadata,
                    extractAnnotationText, enableAutoSpace,
                    suppressDuplicateOverlappingText, sortByPosition).writeText(document, new Writer() {
                @Override
                public void write(char[] cbuf, int off, int len) {
                }
                @Override
                public void flush() {
                }
                @Override
                public void close() {
                }
            });
        } catch (IOException e) {
            // The overridden callbacks tunnel SAXExceptions through the
            // IOException-only PDFTextStripper API; unwrap them here.
            if (e.getCause() instanceof SAXException) {
                throw (SAXException) e.getCause();
            } else {
                throw new TikaException("Unable to extract PDF content", e);
            }
        }
    }

    /**
     * Creates a stripper that writes to the given handler and applies the
     * given extraction options to the underlying {@link PDFTextStripper}.
     *
     * @throws IOException declared because the superclass constructor is
     *         invoked implicitly — presumably it can fail; confirm against
     *         the PDFBox version in use
     */
    private PDF2XHTML(ContentHandler handler, Metadata metadata,
            boolean extractAnnotationText, boolean enableAutoSpace,
            boolean suppressDuplicateOverlappingText, boolean sortByPosition)
            throws IOException {
        this.handler = new XHTMLContentHandler(handler, metadata);
        this.extractAnnotationText = extractAnnotationText;
        setForceParsing(true);
        setSortByPosition(sortByPosition);
        if (enableAutoSpace) {
            setWordSeparator(" ");
        } else {
            setWordSeparator("");
        }
        // TODO: maybe expose setting these too:
        //setAverageCharTolerance(1.0f);
        //setSpacingTolerance(1.0f);
        setSuppressDuplicateOverlappingText(suppressDuplicateOverlappingText);
    }

    /** Starts the XHTML document on the wrapped handler. */
    @Override
    protected void startDocument(PDDocument pdf) throws IOException {
        try {
            handler.startDocument();
        } catch (SAXException e) {
            throw new IOExceptionWithCause("Unable to start a document", e);
        }
    }

    /** Ends the XHTML document on the wrapped handler. */
    @Override
    protected void endDocument(PDDocument pdf) throws IOException {
        try {
            handler.endDocument();
        } catch (SAXException e) {
            throw new IOExceptionWithCause("Unable to end a document", e);
        }
    }

    /** Opens a {@code <div class="page">} element and the page's first paragraph. */
    @Override
    protected void startPage(PDPage page) throws IOException {
        try {
            handler.startElement("div", "class", "page");
        } catch (SAXException e) {
            throw new IOExceptionWithCause("Unable to start a page", e);
        }
        writeParagraphStart();
    }

    /**
     * Closes the current paragraph, optionally emits annotation text, and
     * closes the page {@code <div>}.
     */
    @Override
    protected void endPage(PDPage page) throws IOException {
        try {
            writeParagraphEnd();
            // TODO: remove once PDFBOX-1143 is fixed:
            if (extractAnnotationText) {
                writeAnnotationText(page);
            }
            handler.endElement("div");
        } catch (SAXException e) {
            throw new IOExceptionWithCause("Unable to end a page", e);
        }
    }

    /**
     * Emits one {@code <div class="annotation">} per FreeText markup
     * annotation on the page that has a title, subject or contents.
     */
    private void writeAnnotationText(PDPage page)
            throws IOException, SAXException {
        for (Object o : page.getAnnotations()) {
            // Check the type we actually cast to, so an unexpected
            // annotation class is skipped instead of failing the cast.
            if (!(o instanceof PDAnnotationMarkup)) {
                continue;
            }
            PDAnnotationMarkup annot = (PDAnnotationMarkup) o;
            if (!PDAnnotationMarkup.SUB_TYPE_FREETEXT.equals(annot.getSubtype())) {
                continue;
            }
            // It's a text annotation:
            String title = annot.getTitlePopup();
            String subject = annot.getSubject();
            String contents = annot.getContents();
            // TODO: maybe also annot.getRichContents()?
            if (title == null && subject == null && contents == null) {
                continue;
            }
            handler.startElement("div", "class", "annotation");
            writeAnnotationField("annotationTitle", title);
            writeAnnotationField("annotationSubject", subject);
            writeAnnotationField("annotationContents", contents);
            handler.endElement("div");
        }
    }

    /** Writes {@code text} inside a {@code <div>} of the given class; does nothing for null. */
    private void writeAnnotationField(String cssClass, String text)
            throws SAXException {
        if (text == null) {
            return;
        }
        handler.startElement("div", "class", cssClass);
        handler.characters(text);
        handler.endElement("div");
    }

    /** Opens a {@code <p>} element, first closing any paragraph still open. */
    @Override
    protected void writeParagraphStart() throws IOException {
        // TODO: remove once PDFBOX-1130 is fixed
        if (inParagraph) {
            // Close last paragraph
            writeParagraphEnd();
        }
        assert !inParagraph;
        inParagraph = true;
        try {
            handler.startElement("p");
        } catch (SAXException e) {
            throw new IOExceptionWithCause("Unable to start a paragraph", e);
        }
    }

    /**
     * Closes the current {@code <p>} element; if none is open, an empty
     * paragraph is opened first so the output stays balanced.
     */
    @Override
    protected void writeParagraphEnd() throws IOException {
        // TODO: remove once PDFBOX-1130 is fixed
        if (!inParagraph) {
            writeParagraphStart();
        }
        assert inParagraph;
        inParagraph = false;
        try {
            handler.endElement("p");
        } catch (SAXException e) {
            throw new IOExceptionWithCause("Unable to end a paragraph", e);
        }
    }

    /** Sends the given text to the handler as character data. */
    @Override
    protected void writeString(String text) throws IOException {
        try {
            handler.characters(text);
        } catch (SAXException e) {
            throw new IOExceptionWithCause(
                    "Unable to write a string: " + text, e);
        }
    }

    /** Sends the character(s) of the given text position to the handler. */
    @Override
    protected void writeCharacters(TextPosition text) throws IOException {
        try {
            handler.characters(text.getCharacter());
        } catch (SAXException e) {
            throw new IOExceptionWithCause(
                    "Unable to write a character: " + text.getCharacter(), e);
        }
    }

    /** Sends the configured word separator to the handler. */
    @Override
    protected void writeWordSeparator() throws IOException {
        try {
            handler.characters(getWordSeparator());
        } catch (SAXException e) {
            throw new IOExceptionWithCause(
                    "Unable to write a space character", e);
        }
    }

    /** Sends a single newline character to the handler. */
    @Override
    protected void writeLineSeparator() throws IOException {
        try {
            handler.characters("\n");
        } catch (SAXException e) {
            throw new IOExceptionWithCause(
                    "Unable to write a newline character", e);
        }
    }
}