/**
*
* Copyright 2009-2013 The MITRE Corporation.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*
* **************************************************************************
* NOTICE This software was produced for the U. S. Government under Contract No.
* W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
* Software and Noncommercial Computer Software Documentation Clause
* 252.227-7014 (JUN 1995)
*
* (c) 2012 The MITRE Corporation. All Rights Reserved.
* **************************************************************************
*/
//package org.opensextant.xtext.converters;
import java.io.IOException;
import java.io.StringWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.util.PDFTextStripper;
import org.opensextant.xtext.ConvertedDocument;
import org.opensextant.xtext.iConvert;
/**
* Retired PDF Converter.
*
* @deprecated Tika's PDF parser does a better job. The extra metadata fields I was grabbing here are not that useful.
* Handling of encrypted PDF documents is still suspect.
*
* @author Marc C. Ubaldino, MITRE, ubaldino at mitre dot org
*/
public class PDFConverter implements iConvert {
private PDFTextStripper stripper = null;
/**
* Initialize a reusable PDF engine.
* @throws java.io.IOException
*/
public PDFConverter() throws IOException {
stripper = new PDFTextStripper();
}
@Override
public synchronized ConvertedDocument convert(String data) throws IOException {
throw new IOException("PDF conversion as text blob is not supported here. Send a File obj");
}
/**
* Implementation is informed by PDFBox authors.
*
* @param doc
* @return
* @throws IOException
*/
@Override
public synchronized ConvertedDocument convert(java.io.File doc) throws IOException {
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Adapted from LucenePDFDocument.java from PDFBox lucene project
*
* This class is used to create a document for the lucene search engine.
* This should easily plug into the IndexHTML or IndexFiles that comes
* with the lucene project. This class will populate the following
* fields.
* <table> <tr> <th>Lucene Field Name</th> <th>Description</th> </tr>
* <tr>
* <td>path</td> <td>File system path if loaded from a file</td> </tr>
* <tr>
* <td>url</td> <td>URL to PDF document</td> </tr> <tr>
* <td>contents</td>
* <td>Entire contents of PDF document, indexed but not stored</td>
* </tr>
* <tr> <td>summary</td> <td>First 500 characters of content</td> </tr>
* <tr>
* <td>modified</td> <td>The modified date/time according to the url or
* path</td> </tr> <tr> <td>uid</td> <td>A unique identifier for the
* Lucene document.</td> </tr> <tr> <td>CreationDate</td> <td>From PDF
* meta-data if available</td> </tr> <tr> <td>Creator</td> <td>From PDF
* meta-data if available</td> </tr> <tr> <td>Keywords</td> <td>From PDF
* meta-data if available</td> </tr> <tr> <td>ModificationDate</td>
* <td>From PDF meta-data if available</td> </tr> <tr> <td>Producer</td>
* <td>From PDF meta-data if available</td> </tr> <tr> <td>Subject</td>
* <td>From PDF meta-data if available</td> </tr> <tr> <td>Trapped</td>
* <td>From PDF meta-data if available</td> </tr> <tr>
* <td>Encrypted</td> <td>From PDF meta-data if available</td> </tr>
* </table>
*
* @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
* @version $Revision: 1.23 $
*
* @throws IOException If there is an error parsing the document.
*/
PDDocument pdfDocument = null;
ConvertedDocument textdoc = new ConvertedDocument(doc);
try {
pdfDocument = PDDocument.load(doc);
if (pdfDocument.isEncrypted()) {
//Just try using the default password and move on
// Even if the doc is encrypted, apparently you can try. Throw exception if it fails.
textdoc.addProperty("encrypted", "YES");
}
//create a writer where to append the text content.
StringWriter writer = new StringWriter();
stripper.resetEngine();
stripper.writeText(pdfDocument, writer);
PDDocumentInformation info = pdfDocument.getDocumentInformation();
if (info != null) {
textdoc.addAuthor(info.getAuthor());
try {
textdoc.addCreateDate(info.getCreationDate());
} catch (IOException io) {
//ignore, bad date but continue with indexing
}
textdoc.addProperty("creator_tool", info.getCreator());
textdoc.addProperty("keywords", info.getKeywords());
/* try {
metadata.add("ModificationDate", info.getModificationDate());
} catch (IOException io) {
//ignore, bad date but continue with indexing
} */
//metadata.add("Producer", info.getProducer());
textdoc.addProperty("subject", info.getSubject());
String ttl = info.getTitle();
if (ttl == null || "untitled".equalsIgnoreCase(ttl)) {
ttl = textdoc.filename;
}
textdoc.addTitle(ttl);
// metadata.add("Trapped", info.getTrapped());
// TODO: Character set is what?
textdoc.setEncoding("UTF-8");
}
// Note: the buffer to string operation is costless;
// the char array value of the writer buffer and the content string
// is shared as long as the buffer content is not modified, which will
// not occur here.
textdoc.setText(writer.getBuffer().toString());
return textdoc;
} finally {
if (pdfDocument != null) {
pdfDocument.close();
}
}
}
}