/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse.pdf;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
import org.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.pdfbox.util.PDFTextStripper;
import org.pdfbox.exceptions.CryptographyException;
// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.Response;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.OutlinkExtractor;
import org.apache.nutch.util.LogUtil;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.io.ByteArrayInputStream;
import java.io.IOException;
/*********************************************
* parser for mime type application/pdf.
* It is based on org.pdfbox.*. We have to see how well it does the job.
*
* @author John Xing
*
* Note on 20040614 by Xing:
* Some codes are stacked here for convenience (see inline comments).
* They may be moved to more appropriate places when new codebase
* stabilizes, especially after code for indexing is written.
*
*********************************************/
public class PdfParser implements Parser {
public static final Log LOG = LogFactory.getLog("org.apache.nutch.parse.pdf");
private Configuration conf;
public ParseResult getParse(Content content) {
// in memory representation of pdf file
PDDocument pdf = null;
String text = null;
String title = null;
Metadata metadata = new Metadata();
try {
byte[] raw = content.getContent();
String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
if (contentLength != null
&& raw.length != Integer.parseInt(contentLength)) {
return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
"Content truncated at "+raw.length
+" bytes. Parser can't handle incomplete pdf file.").getEmptyParseResult(content.getUrl(), getConf());
}
PDFParser parser = new PDFParser(new ByteArrayInputStream(raw));
parser.parse();
pdf = parser.getPDDocument();
if (pdf.isEncrypted()) {
//Just try using the default password and move on
pdf.openProtection(new StandardDecryptionMaterial(""));
}
// collect text
PDFTextStripper stripper = new PDFTextStripper();
text = stripper.getText(pdf);
// collect title
PDDocumentInformation info = pdf.getDocumentInformation();
title = info.getTitle();
// more useful info, currently not used. please keep them for future use.
metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getNumberOfPages()));
metadata.add(Metadata.AUTHOR, info.getAuthor());
metadata.add(Metadata.SUBJECT, info.getSubject());
metadata.add(Metadata.KEYWORDS, info.getKeywords());
metadata.add(Metadata.CREATOR, info.getCreator());
metadata.add(Metadata.PUBLISHER, info.getProducer());
//TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
//error here
//metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));
//metadata.put(LAST_MODIFIED, dcDateFormatter.format(info.getModificationDate().getTime()));
} catch (CryptographyException e) {
return new ParseStatus(ParseStatus.FAILED,
"Error decrypting document. " + e).getEmptyParseResult(content.getUrl(), getConf());
} catch (BadSecurityHandlerException e) {
return new ParseStatus(ParseStatus.FAILED,
"Error decrypting document. " + e).getEmptyParseResult(content.getUrl(), getConf());
} catch (Exception e) { // run time exception
if (LOG.isWarnEnabled()) {
LOG.warn("General exception in PDF parser: "+e.getMessage());
e.printStackTrace(LogUtil.getWarnStream(LOG));
}
return new ParseStatus(ParseStatus.FAILED,
"Can't be handled as pdf document. " + e).getEmptyParseResult(content.getUrl(), getConf());
} finally {
try {
if (pdf != null)
pdf.close();
} catch (IOException e) {
// nothing to do
}
}
if (text == null)
text = "";
if (title == null)
title = "";
// collect outlink
Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
outlinks, content.getMetadata(),
metadata);
return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
// any filter?
//return HtmlParseFilters.filter(content, parse, root);
}
// format date
// currently not used. please keep it for future use.
private String formatDate(Calendar date) {
String retval = null;
if(date != null) {
SimpleDateFormat formatter = new SimpleDateFormat();
retval = formatter.format(date.getTime());
}
return retval;
}
public void setConf(Configuration conf) {
this.conf = conf;
}
public Configuration getConf() {
return this.conf;
}
}