/** * JHOVE2 - Next-generation architecture for format-aware characterization * <p> * Copyright (c) 2009 by The Regents of the University of California, Ithaka * Harbors, Inc., and The Board of Trustees of the Leland Stanford Junior * University. All rights reserved. * </p> * <p> * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * </p> * <ul> * <li>Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer.</li> * <li>Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution.</li> * <li>Neither the name of the University of California/California Digital * Library, Ithaka Harbors/Portico, or Stanford University, nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission.</li> * </ul> * <p> * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * </p> */ package org.jhove2.module.format.xml; import java.io.EOFException; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.jhove2.annotation.ReportableProperty; import org.jhove2.core.JHOVE2; import org.jhove2.core.JHOVE2Exception; import org.jhove2.core.Message; import org.jhove2.core.Message.Context; import org.jhove2.core.Message.Severity; import org.jhove2.core.format.Format; import org.jhove2.core.io.Input; import org.jhove2.core.source.MeasurableSource; import org.jhove2.core.source.Source; import org.jhove2.module.format.BaseFormatModule; import org.jhove2.module.format.Validator; import org.jhove2.persist.FormatModuleAccessor; import com.sleepycat.persist.model.NotPersistent; import com.sleepycat.persist.model.Persistent; /** * JHOVE2 XML module. This module parses and XML instance and captures selected * characterization information * * @author rnanders */ @Persistent public class XmlModule extends BaseFormatModule implements Validator { /** Module version identifier. */ public static final String VERSION = "2.0.0"; /** Module release date. */ public static final String RELEASE = "2010-09-10"; /** Module rights statement. */ public static final String RIGHTS = "Copyright 2010 by The Board of Trustees of the Leland Stanford Junior University. " + "Available under the terms of the BSD license."; /** Module validation coverage. */ public static final Coverage COVERAGE = Coverage.Inclusive; /** The JHOVE2 object passed in by the parse method */ @NotPersistent protected JHOVE2 jhove2; /** XML validation status. */ protected Validity validity; /** Unresolved schema reference in XML document */ protected boolean unresolvedSchemaReference; /** * Get module validation coverage. * * @return the coverage */ @Override public Coverage getCoverage() { return COVERAGE; } /** * Get source unit's validation status. * * @return the validity */ @Override public Validity isValid() { return validity; } /** * Validate the XML parse results. * * @param jhove2 * JHOVE2 framework * @param source * XML source unit * @param input XML source input * @return the validity * * @throws JHOVE2Exception * the JHOVE2 exception */ @Override public Validity validate(JHOVE2 jhove2, Source source, Input input) throws JHOVE2Exception { /* See if validity has been previously set to False, e.g. by parse exception trap */ if (validity == Validity.False) { return validity; } /* Check to see if there were SAX parser errors of any sort */ if (validationResults.fatalParserErrors.getValidationMessageCount() > 0) { Object[]messageArgs = new Object[]{"Fatal Parser Errors found"}; saxParserMessages.add(new Message(Severity.ERROR, Context.OBJECT, "org.jhove2.module.format.xml.XmlModule.validationErrorsFound", messageArgs, jhove2.getConfigInfo())); return (validity = Validity.False); } else if (validationResults.parserErrors.getValidationMessageCount() > 0) { Object[]messageArgs = new Object[]{"Parser Errors found"}; saxParserMessages.add(new Message(Severity.ERROR, Context.OBJECT, "org.jhove2.module.format.xml.XmlModule.validationErrorsFound", messageArgs, jhove2.getConfigInfo())); return (validity = Validity.False); } /* Did SAX parser find a referenced schema file that could not be resolved? */ if (this.unresolvedSchemaReference) { return (validity = Validity.Undetermined); } /* No validation errors found, but make sure schema validation was enabled, if appropriate */ if (namespaceInformation.hasSchemaLocations) { if (! saxParser.hasFeature("http://apache.org/xml/features/validation/schema")) { Object[]messageArgs = new Object[]{"Schema location(s) specified, but schema validation is disabled by SAX feature setting."}; saxParserMessages.add(new Message(Severity.WARNING, Context.OBJECT, "org.jhove2.module.format.xml.XmlModule.validationDisabled", messageArgs, jhove2.getConfigInfo())); return (validity = Validity.Undetermined); } } /* No validation errors found, but make sure validation was enabled */ if (saxParser.hasFeature("http://apache.org/xml/features/validation/dynamic")) { return (validity = Validity.True); } else if (saxParser.hasFeature("http://xml.org/sax/features/validation")) { return (validity = Validity.True); } else { Object[]messageArgs = new Object[]{"XML validation is disabled by SAX feature setting."}; saxParserMessages.add(new Message(Severity.WARNING, Context.OBJECT, "org.jhove2.module.format.xml.XmlModule.validationDisabled", messageArgs, jhove2.getConfigInfo())); return (validity = Validity.Undetermined); } } /** * Instantiates a new XmlModule instance. * * @param format * the Format object * @param formatModuleAccessor * FormatModuleAccessor to manage access to Format Profiles */ public XmlModule(Format format, FormatModuleAccessor formatModuleAccessor) { super(VERSION, RELEASE, RIGHTS, format, formatModuleAccessor); this.validity = Validity.Undetermined; } @SuppressWarnings("unused") private XmlModule(){ this(null, null); } /** The instance of a SAX2 XMLReader class used to parse XML instances. */ protected SaxParser saxParser; /** If true, run a separate parse to extract numeric character references */ protected boolean ncrParser = false; /** Data store for XML declaration information captured during the parse. */ protected XmlDeclaration xmlDeclaration = new XmlDeclaration(); /** The XML document's root element. */ protected RootElement rootElement; /** A list of the documents document scope declarations. */ protected List<DTD> dtds = new ArrayList<DTD>(); /** Data store for XML namespace information captured during the parse. */ protected NamespaceInformation namespaceInformation = new NamespaceInformation(); /** Data store for XML notation information captured during the parse. */ protected List<Notation> notations = new ArrayList<Notation>(); /** Data store for XML entity declarations captured during the parse. */ protected List<Entity> entities = new ArrayList<Entity>(); /** Data store for XML entity references captured during the parse. */ protected EntityReferences entityReferences = new EntityReferences(); /** * Data store for XML numeric character references captured during the * parse. */ protected NumericCharacterReferenceInformation numericCharacterReferenceInformation = new NumericCharacterReferenceInformation(); /** * Data store for XML processing instruction information captured during the * parse. */ protected List<ProcessingInstruction> processingInstructions = new ArrayList<ProcessingInstruction>(); /** Data store for XML comment information captured during the parse. */ protected CommentInformation commentInformation = new CommentInformation(); /** The validation results. */ protected ValidationResults validationResults = new ValidationResults(); /** Fail fast message. */ protected Message failFastMessage; /** SAX Parser messages. */ protected ArrayList<Message> saxParserMessages = new ArrayList<Message>(); /** The well formed status of the source unit. */ protected Validity wellFormed = Validity.Undetermined; /** * Sets the SaxParser object to be used for parsing the source unit. * * @param saxParser * the saxParser object */ public void setSaxParser(SaxParser saxParser) { this.saxParser = saxParser; this.saxParser.setXmlModule(this); } /** * Gets the SaxParser object used for parsing the source unit. * * @return SaxParser object */ @ReportableProperty(order = 1, value = "XML Parser name, features, properties") public SaxParser getSaxParser() { return saxParser; } /** Sets the flag that specifies whether or not to collect comment text */ public void setCollectCommentText(boolean collectCommentText) { this.commentInformation.collectCommentText = collectCommentText; } /** * Sets the class name of the parser to be used for extracting numeric * character references */ public void setNcrParser(boolean ncrParser) { this.ncrParser = ncrParser; } /** * Gets the XML Declaration data. * * @return XML Declaration data */ @ReportableProperty(order = 2, value = "XML Declaration data") public XmlDeclaration getXmlDeclaration() { return xmlDeclaration; } /** * Gets the root element name. * * @return root element name */ @ReportableProperty(order = 3, value = "The document's root element") public RootElement getRootElement() { return rootElement; } /** * Gets the list of documents document scope declarations (DTDs). * * @return list of documents document scope declarations (DTDs) */ @ReportableProperty(order = 4, value = "List of Document Scope Definitions (DTDs)") public List<DTD> getDTDs() { return dtds; } /** * Gets the list of XML namespaces. * * @return list of XML namespaces */ @ReportableProperty(order = 5, value = "XML Namespace Information") public NamespaceInformation getNamespaceInformation() { return namespaceInformation; } /** * Gets the list of XML entity declarations. * * @return list of XML entity declarations */ @ReportableProperty(order = 6, value = "List of Entity Declarations") public List<Entity> getEntities() { return entities; } /** * Gets the list of XML entity references. * * @return list of XML entity references */ @ReportableProperty(order = 7, value = "List of Entity References") public ArrayList<EntityReference> getEntityReferences() { return entityReferences.getEntityReferenceList(); } /** * Gets the list of XML notations. * * @return list of XML notations */ @ReportableProperty(order = 8, value = "List of Notations found in the XML document") public List<Notation> getNotations() { return notations; } /** * Gets the list of XML Numeric Character References. * * @return list of XML Numeric Character References */ @ReportableProperty(order = 9, value = "Numeric Character Reference Information") public NumericCharacterReferenceInformation getNumericCharacterReferenceInformation() { return numericCharacterReferenceInformation; } /** * Gets the list of XML processing instructions. * * @return list of XML processing instructions */ @ReportableProperty(order = 10, value = "List of Processing Instructions") public List<ProcessingInstruction> getProcessingInstructions() { return processingInstructions; } /** * Gets the list of XML comments. * * @return list of XML comments */ @ReportableProperty(order = 11, value = "List of Comments") public CommentInformation getCommentInformation() { return commentInformation; } /** * Gets the validation results. * * @return validation results */ @ReportableProperty(order = 12, value = "Warning or error messages generated during XML Validation") public ValidationResults getValidationResults() { return validationResults; } /** * Gets the well-formedness status. * * @return well-formedness status */ @ReportableProperty(order = 13, value = "XML well-formed status") public Validity isWellFormed() { return wellFormed; } /** * Get fail fast message. * * @return Fail fast message */ @ReportableProperty(order = 14, value = "Fail fast message.") public Message getFailFast() { return this.failFastMessage; } /** * Get SAX Parser messages. * * @return SAX Parser messages */ @ReportableProperty(order = 15, value = "SAX Parser Messages.") public List<Message> getSaxParserMessages() { return this.saxParserMessages; } /** * Parse a source unit. * * @param jhove2 * JHOVE2 framework * @param source * XML source unit * @param input * XML source input * @return Number of bytes consumed * * @throws EOFException * the EOF exception * @throws IOException * Signals that an I/O exception has occurred. * @throws JHOVE2Exception * the JHOVE2 exception */ @Override public long parse(JHOVE2 jhove2, Source source, Input input) throws IOException, JHOVE2Exception { this.jhove2 = jhove2; /* Use SAX2 to get what information is available from that mechanism */ saxParser.parse(jhove2, source, input); //source, jhove2); /* * Do a separate parse of the XML Declaration at the start of the * document */ long start = ((MeasurableSource) source).getStartingOffset(); input.setPosition(start); xmlDeclaration.parse(input); /* Do a separate parse to inventory numeric character references */ if (this.ncrParser) { input.setPosition(start); numericCharacterReferenceInformation.parse(input, xmlDeclaration.encodingFromSAX2, jhove2); } return 0; } }