package org.cdlib.xtf.textIndexer;
/**
* Copyright (c) 2004, Regents of the University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the University of California nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
import java.io.IOException;
import java.io.InputStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.cdlib.xtf.util.*;
//////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////
/** This class provides a single static {@link PDFToString#convert(InputStream) convert() }
* method that converts the text in a PDF file into an XML string that can be
* pre-filtered and added to a Lucene database by the
* {@link XMLTextProcessor } class. <br><br>
*
* Internally, the text of the PDF file is extracted using the PDFBox library.
*/
public class PDFToString
{
static boolean mustConfigureLogger = true;
/** PDFBox text stripper. Created once to save time. */
static PDFTextStripper stripper;
//////////////////////////////////////////////////////////////////////////////
/** Convert a PDF file into an XML string.
*
* @param PDFInputStream The stream of PDF data to convert to an
* XML string.
*
* @return
* If successful, a string containing the XML equivalent of the source
* PDF file. If an error occurred, this method returns <code>null</code>.
*
*/
static String convert(InputStream PDFInputStream)
throws IOException
{
// Make a stripper if we haven't already.
if (stripper == null)
stripper = new PDFTextStripper();
// Workaround: using PDFTextStripper normally results in a Window
// being created. However, since we're running in a servlet container, this
// isn't generally desirable (and often isn't possible.) So we let AWT know
// that it's running in "headless" mode, and this prevents the window from
// being created.
//
System.setProperty("java.awt.headless", "true");
XMLFormatter formatter = new XMLFormatter();
try
{
PDDocument pdfDoc = null;
try
{
// Get hold of the PDF document to convert.
pdfDoc = PDDocument.load(PDFInputStream);
// If the document is encrypted, we've got a problem.
if (pdfDoc.isEncrypted()) {
Trace.info("*** PDF File is Encrypted. File Skipped.");
throw new Exception();
}
// Start the XML with an XML format tag.
formatter.procInstr("xml version=\"1.0\" encoding=\"utf-8\"");
// Set up the tab size and blank line formatting.
formatter.tabSize(4);
formatter.blankLineAfterTag(false);
// Determine how many pages there are in the PDF file.
int pageCount = pdfDoc.getNumberOfPages();
// Create an all-enclosing document tag summarizing
// the original document name and the number of pages.
//
formatter.beginTag("pdfDocument");
formatter.attr("pageCount", pageCount);
// Process each page in the PDF document.
for (int i = 1; i <= pageCount; i++)
{
// Start with a new page tag.
formatter.beginTag("pdfPage");
formatter.attr("number", i);
// Tell the stripper to only process the current page.
stripper.setStartPage(i);
stripper.setEndPage(i);
// Get the text for this page.
String pdfText = stripper.getText(pdfDoc);
// Escape and normalize characters.
pdfText = XMLIndexSource.normalize(pdfText);
// Tack the text onto the XML output, nicely formatted
// into lines of 128 characters or less.
//
formatter.text(pdfText, 128);
formatter.newLineAfterText();
// End the current page tag.
formatter.endTag();
} // for( int i = 1; i <= pageCount; i++ )
// End any remaining open tags (should only be the pdfDocument
// tag.)
//
formatter.endAllTags();
} // try
// If anything went wrong, say what it was.
catch (Throwable t) {
Trace.error("*** PDFToXML.convert() Exception: " + t.getClass());
Trace.error(" With message: " + t.getMessage());
}
// Finally, close up the the PDF document.
finally {
if (pdfDoc != null)
pdfDoc.close();
}
} // try
// Shunt out any other exceptions.
catch (Throwable t) {
Trace.error("*** PDFToXML.convert() Exception: " + t.getClass());
Trace.error(" With message: " + t.getMessage());
}
// Return the resulting XML string to the caller.
return formatter.toString();
} // public convert()
} // class PDFToString()