/**
*
*/
package uk.bl.wa.parsers;
/*
* #%L
* warc-indexer
* $Id:$
* $HeadURL:$
* %%
* Copyright (C) 2013 - 2014 The UK Web Archive
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program. If not, see
* <http://www.gnu.org/licenses/gpl-2.0.html>.
* #L%
*/
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import org.apache.pdfbox.preflight.PreflightDocument;
import org.apache.pdfbox.preflight.ValidationResult;
import org.apache.pdfbox.preflight.ValidationResult.ValidationError;
import org.apache.pdfbox.preflight.exception.SyntaxValidationException;
import org.apache.pdfbox.preflight.parser.PreflightParser;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import uk.bl.wa.util.InputStreamDataSource;
/**
* @author Andrew Jackson <Andrew.Jackson@bl.uk>
*
*/
public class ApachePreflightParser extends AbstractParser {
/** */
private static final long serialVersionUID = 710873621129254338L;
/** */
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
MediaType.application("pdf")
)));
public static final Property PDF_PREFLIGHT_VALID = Property.internalBoolean("PDF-A-PREFLIGHT-VALID");
public static final Property PDF_PREFLIGHT_ERRORS = Property.internalTextBag("PDF-A-PREFLIGHT-ERRORS");
/* (non-Javadoc)
* @see org.apache.tika.parser.Parser#getSupportedTypes(org.apache.tika.parser.ParseContext)
*/
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
/* (non-Javadoc)
* @see org.apache.tika.parser.Parser#parse(java.io.InputStream, org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata, org.apache.tika.parser.ParseContext)
*/
@Override
public void parse(InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context) throws IOException,
SAXException, TikaException {
// Attempt to reduce logging of stacktraces:
//System.setProperty("log4j.logger.org.apache.pdfbox","");
// Set up the validation result:
ValidationResult result = null;
InputStreamDataSource isds = new InputStreamDataSource(stream);
PreflightParser parser = new PreflightParser(isds);
PreflightDocument document = null;
try {
/* Parse the PDF file with PreflightParser that inherits from the NonSequentialParser.
* Some additional controls are present to check a set of PDF/A requirements.
* (Stream length consistency, EOL after some Keyword...)
*/
parser.parse();
/* Once the syntax validation is done,
* the parser can provide a PreflightDocument
* (that inherits from PDDocument)
* This document process the end of PDF/A validation.
*/
document = parser.getPreflightDocument();
document.validate();
// Get validation result
result = document.getResult();
} catch (SyntaxValidationException e) {
/*
* the parse method can throw a SyntaxValidationExceptionif the PDF
* file can't be parsed.
*
* In this case, the exception contains an instance of
* ValidationResult
*/
result = e.getResult();
} catch (Exception e) {
// Otherwise, a NULL result:
result = null;
} finally {
// Ensure the document is always closed:
if (document != null)
document.close();
}
// display validation result
Set<String> rs = new HashSet<String>();
if (result != null && result.isValid()) {
//System.out.println("The resource is not a valid PDF/A-1b file");
metadata.set( PDF_PREFLIGHT_VALID, Boolean.TRUE.toString() );
} else {
//System.out.println("The resource is not valid, error(s) :");
metadata.set( PDF_PREFLIGHT_VALID, Boolean.FALSE.toString() );
if (result != null) {
for (ValidationError error : result.getErrorsList()) {
// System.out.println(error.getErrorCode() + " : " +
// error.getDetails());
rs.add(error.getErrorCode() + " : " + error.getDetails());
}
}
}
metadata.set( PDF_PREFLIGHT_ERRORS , rs.toArray( new String[] {} ));
}
}