/**
* <a href="http://www.openolat.org">
* OpenOLAT - Online Learning and Training</a><br>
* <p>
* Licensed under the Apache License, Version 2.0 (the "License"); <br>
* you may not use this file except in compliance with the License.<br>
* You may obtain a copy of the License at the
* <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache homepage</a>
* <p>
* Unless required by applicable law or agreed to in writing,<br>
* software distributed under the License is distributed on an "AS IS" BASIS, <br>
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br>
* See the License for the specific language governing permissions and <br>
* limitations under the License.
* <p>
* Initial code contributed and copyrighted by<br>
* frentix GmbH, http://www.frentix.com
* <p>
*/
package org.olat.search.service.document.file.pdf;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.olat.core.logging.OLog;
import org.olat.core.logging.Tracing;
import org.olat.core.util.StringHelper;
import org.olat.core.util.io.LimitedContentWriter;
import org.olat.core.util.vfs.VFSLeaf;
import org.olat.search.service.document.file.DocumentAccessException;
import org.olat.search.service.document.file.FileContent;
import org.olat.search.service.document.file.FileDocumentFactory;
/**
*
* Initial date: 19.07.2013<br>
* @author srosse, stephane.rosse@frentix.com, http://www.frentix.com
*
*/
public class PdfBoxExtractor implements PdfExtractor {
private static final OLog log = Tracing.createLoggerFor(PdfBoxExtractor.class);
@Override
public void extract(VFSLeaf document, File bufferFile)
throws IOException, DocumentAccessException {
FileContent content = extractTextFromPdf(document);
storePdfTextInBuffer(content, bufferFile);
}
private void storePdfTextInBuffer(FileContent pdfText, File pdfTextFile) throws IOException {
try(FileWriter out = new FileWriter(pdfTextFile)) {
if(StringHelper.containsNonWhitespace(pdfText.getTitle())) {
out.write(pdfText.getTitle());
out.write("\u00A0|\u00A0");
}
out.write(pdfText.getContent());
} catch(IOException e) {
throw e;
}
}
private FileContent extractTextFromPdf(VFSLeaf leaf) throws IOException, DocumentAccessException {
if (log.isDebug()) log.debug("readContent from pdf starts...");
PDDocument document = null;
BufferedInputStream bis = null;
try {
bis = new BufferedInputStream(leaf.getInputStream());
document = PDDocument.load(bis);
if (document.isEncrypted()) {
try {
document.decrypt("");
} catch (Exception e) {
log.warn("PDF is encrypted. Can not read content file=" + leaf.getName());
LimitedContentWriter writer = new LimitedContentWriter(128, FileDocumentFactory.getMaxFileSize());
writer.append(leaf.getName());
writer.close();
return new FileContent(leaf.getName(), writer.toString());
}
}
String title = getTitle(document);
if (log.isDebug()) log.debug("readContent PDDocument loaded");
PDFTextStripper stripper = new PDFTextStripper();
LimitedContentWriter writer = new LimitedContentWriter(50000, FileDocumentFactory.getMaxFileSize());
stripper.writeText(document, writer);
writer.close();
return new FileContent(title, writer.toString());
} finally {
if (document != null) {
document.close();
}
if (bis != null) {
bis.close();
}
}
}
private String getTitle(PDDocument document) {
if(document != null && document.getDocumentInformation() != null) {
return document.getDocumentInformation().getTitle();
}
return null;
}
}